In [24]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as gg

#matplotlib.style.use('ggplot')
matplotlib.style.use('seaborn')

matplotlib.rcParams['figure.figsize'] = (12,8)
%matplotlib inline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

import datetime
import time
In [194]:
# calculate the score of model
def score(fpr, tpr):
    """Weighted-TPR competition score.

    Reads the true-positive rate off the ROC curve at the first points where
    the false-positive rate reaches 0.001, 0.005 and 0.01 and combines them:

        score = 0.4*TPR@0.001 + 0.3*TPR@0.005 + 0.3*TPR@0.01

    Parameters
    ----------
    fpr, tpr : 1-D numpy arrays as returned by sklearn.metrics.roc_curve
        (fpr is monotonically non-decreasing).

    Raises
    ------
    ValueError
        If the curve never reaches FPR >= 0.01 (the original version raised
        an opaque IndexError in that case).
    """
    def tpr_at(threshold):
        # first index where the FPR crosses the threshold
        idx = np.where(fpr >= threshold)[0]
        if idx.size == 0:
            raise ValueError("ROC curve never reaches FPR >= %g" % threshold)
        return tpr[idx[0]]

    return 0.4 * tpr_at(0.001) + 0.3 * tpr_at(0.005) + 0.3 * tpr_at(0.01)
In [202]:
# draw roc and pr curve
def roc_and_pr(y_te, y_score):
    """Plot the ROC curve (left panel) and the precision-recall curve
    (right panel) for true labels ``y_te`` and predicted scores ``y_score``,
    side by side in a single figure."""
    line_width = 2

    # --- ROC panel ---
    fpr, tpr, _ = roc_curve(y_te, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(15, 8))
    plt.subplot(121)
    plt.plot(fpr, tpr, color='darkorange', lw=line_width,
             label='ROC curve (area = %0.3f)' % roc_auc)
    # diagonal chance line for reference
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")

    # --- PR panel ---
    precision, recall, _ = precision_recall_curve(y_te, y_score)
    average_precision = average_precision_score(y_te, y_score)

    plt.subplot(122)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('PR curve: AP={0:0.3f}'.format(average_precision))
In [5]:
data = pd.read_csv('atec_anti_fraud_train.csv')
In [6]:
data.head()
Out[6]:
id label date f1 f2 f3 f4 f5 f6 f7 ... f288 f289 f290 f291 f292 f293 f294 f295 f296 f297
0 f10eb20f31cf7063ee8bdbd1272214e4d7e0193c8dbce4... 0 20171103 0 0 0 0 100807.0 0 5 ... 301.0 312.0 328.0 85.0 302.0 201.0 203.0 203.0 61.0 201.0
1 d861929b67938d06538b910b9f6b85f5eb62b6ad7361ba... 0 20170917 0 1 1 1 100805.0 1 5 ... 302.0 324.0 391.0 13.0 302.0 160.0 160.0 161.0 8.0 160.0
2 1270cb8a85eedd57672b2c6297fa5633e36773a2c3a351... 0 20171022 0 0 1 0 100102.0 0 6 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 9fa009724ee7ff9d688ae321304fbc78f608cdabbfdd2b... 0 20171029 0 0 0 1 100807.0 1 4 ... 302.0 322.0 341.0 57.0 251.0 175.0 176.0 176.0 49.0 150.0
4 1da482485d7e8bcefae7e9d0d1167cec3ac111cfa71d8b... 0 20171002 1 1 0 1 100805.0 1 5 ... 302.0 301.0 301.0 74.0 302.0 182.0 181.0 182.0 51.0 181.0

5 rows × 300 columns

In [6]:
data.shape
Out[6]:
(994731, 300)

1. Exploratory Analysis

1.1 date

In [7]:
date_df = data.iloc[:,1:3]
In [8]:
date_df.head()
Out[8]:
label date
0 0 20171103
1 0 20170917
2 0 20171022
3 0 20171029
4 0 20171002
In [9]:
def strptime(date):
    """Parse an integer or string date in YYYYMMDD form (e.g. 20171103)
    into a time.struct_time."""
    return time.strptime(str(date), "%Y%m%d")

data_strptime = date_df.date.apply(strptime)
In [10]:
# Expand each parsed struct_time into separate calendar columns.
# struct_time indices: 0 = year, 1 = month, 2 = day, 6 = weekday (Monday = 0).
date_df['year'] = [data_strptime[i][0] for i in range(len(data_strptime))]
date_df['month'] = [data_strptime[i][1] for i in range(len(data_strptime))]
date_df['day'] = [data_strptime[i][2] for i in range(len(data_strptime))]
date_df['weekday'] = [data_strptime[i][6] for i in range(len(data_strptime))]
date_df['strptime'] = data_strptime
In [11]:
date_df.head()
Out[11]:
label date year month day weekday strptime
0 0 20171103 2017 11 3 4 (2017, 11, 3, 0, 0, 0, 4, 307, -1)
1 0 20170917 2017 9 17 6 (2017, 9, 17, 0, 0, 0, 6, 260, -1)
2 0 20171022 2017 10 22 6 (2017, 10, 22, 0, 0, 0, 6, 295, -1)
3 0 20171029 2017 10 29 6 (2017, 10, 29, 0, 0, 0, 6, 302, -1)
4 0 20171002 2017 10 2 0 (2017, 10, 2, 0, 0, 0, 0, 275, -1)
In [29]:
date_df = date_df.sort_values(by='strptime')
In [30]:
date_df.head()
Out[30]:
label date year month day weekday strptime
937719 0 20170905 2017 9 5 1 (2017, 9, 5, 0, 0, 0, 1, 248, -1)
936071 0 20170905 2017 9 5 1 (2017, 9, 5, 0, 0, 0, 1, 248, -1)
741462 0 20170905 2017 9 5 1 (2017, 9, 5, 0, 0, 0, 1, 248, -1)
814385 0 20170905 2017 9 5 1 (2017, 9, 5, 0, 0, 0, 1, 248, -1)
272696 0 20170905 2017 9 5 1 (2017, 9, 5, 0, 0, 0, 1, 248, -1)
In [31]:
date_df.tail()
Out[31]:
label date year month day weekday strptime
663613 0 20171105 2017 11 5 6 (2017, 11, 5, 0, 0, 0, 6, 309, -1)
197795 0 20171105 2017 11 5 6 (2017, 11, 5, 0, 0, 0, 6, 309, -1)
197796 0 20171105 2017 11 5 6 (2017, 11, 5, 0, 0, 0, 6, 309, -1)
392557 0 20171105 2017 11 5 6 (2017, 11, 5, 0, 0, 0, 6, 309, -1)
742380 0 20171105 2017 11 5 6 (2017, 11, 5, 0, 0, 0, 6, 309, -1)

* The data is from 2017-09-05 to 2017-11-05.

split the data based on date.

train data: date between [ 20170905, 20171020]

test data: date between [ 20171021, 20171105]

Almost 3 : 1.

Store as csv

In [90]:
# Time-based train/test split at 2017-10-20 (train ~3 : test ~1).
# The boolean masks are pandas Series, so indexing `data` with them aligns
# by index label — sorting date_df earlier does not break the row matching.
thres = time.strptime(str(20171020), "%Y%m%d")
train1020_with_unlabel = data[date_df['strptime'] <=thres]
test1020_with_unlabel = data[date_df['strptime'] >thres]

# Variants that drop the unlabeled (-1) rows; date_ keeps the strptime
# column restricted to those same labeled rows.
data_without_unlabel = data[data['label'] != -1]
date_ = date_df[date_df['label'] != -1 ]

train1020_without_unlabel = data_without_unlabel[date_['strptime'] <=thres]
test1020_without_unlabel = data_without_unlabel[date_['strptime'] >thres]
In [89]:
train1020_with_unlabel.to_csv('/Users/lijh/Downloads/ATEC/train1020_with_unlabel.csv',index=False)
test1020_with_unlabel.to_csv('/Users/lijh/Downloads/ATEC/test1020_with_unlabel.csv',index=False)
train1020_without_unlabel.to_csv('/Users/lijh/Downloads/ATEC/train1020_only_label.csv',index=False)
test1020_without_unlabel.to_csv('/Users/lijh/Downloads/ATEC/test1020_only_label.csv',index=False)

1.1.1 weekday

In [91]:
label_0 = date_df[date_df['label'] == 0]
label_1 = date_df[date_df['label'] == 1]
unlabel = date_df[date_df['label'] == -1]
In [93]:
plt.rcParams['figure.figsize'] = (20,15)

plt.subplot(221)
date_df.weekday.hist(bins = 13, density = True)
plt.title('label = 0', fontsize = 30)

plt.subplot(222)
label_0.weekday.hist(bins = 13, density = True)
plt.title('label = 0', fontsize = 30)

plt.subplot(223)
label_1.weekday.hist(bins = 13, density = True)
plt.title('label = 1', fontsize = 30)
plt.xlabel('weekday',fontsize = 20)

plt.subplot(224)
unlabel.weekday.hist(bins = 13, density =True)
plt.title('label = -1', fontsize = 30)
plt.xlabel('weekday',fontsize = 20)
Out[93]:
Text(0.5,0,'weekday')
In [94]:
label0_weekday = []
label1_weekday = []
unlabel_weekday = []
for i in range(7):
    weekday_ = date_df[date_df['weekday']==i]
    num = len(weekday_)
    label0_weekday.append( len(weekday_[weekday_['label'] == 0]) / num )
    label1_weekday.append( len(weekday_[weekday_['label'] == 1]) / num )
    unlabel_weekday.append( len(weekday_[weekday_['label'] == -1]) / num )
In [95]:
each_weekday = pd.DataFrame({'label 0': label0_weekday, 'label 1': label1_weekday, 
                         'unlabel': unlabel_weekday})
In [99]:
cf.go_offline()
each_weekday[['label 1','unlabel']].iplot(kind='spread', title='The rate of label = 1 and unlabel in each weekday',yTitle = 'rate')

* Friday, Saturday and Sunday are more likely to have risky transactions.

1.1.2 time in order

In [98]:
import pygal
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from plotly import __version__
import cufflinks as cf
In [100]:
day = date_df['date'].value_counts().index.values
day.sort()
day
Out[100]:
array([20170905, 20170906, 20170907, 20170908, 20170909, 20170910,
       20170911, 20170912, 20170913, 20170914, 20170915, 20170916,
       20170917, 20170918, 20170919, 20170920, 20170921, 20170922,
       20170923, 20170924, 20170925, 20170926, 20170927, 20170928,
       20170929, 20170930, 20171001, 20171002, 20171003, 20171004,
       20171005, 20171006, 20171007, 20171008, 20171009, 20171010,
       20171011, 20171012, 20171013, 20171014, 20171015, 20171016,
       20171017, 20171018, 20171019, 20171020, 20171021, 20171022,
       20171023, 20171024, 20171025, 20171026, 20171027, 20171028,
       20171029, 20171030, 20171031, 20171101, 20171102, 20171103,
       20171104, 20171105])
In [101]:
label0_day = []
label1_day = []
unlabel_day = []
for i in day:
    day_ = date_df[date_df['date']==i]
    num = len(day_)
    label0_day.append( len(day_[day_['label'] == 0]) / num )
    label1_day.append( len(day_[day_['label'] == 1]) / num )
    unlabel_day.append( len(day_[day_['label'] == -1]) / num )
In [102]:
each_day = pd.DataFrame({'day': day, 'label 0': label0_day, 'label 1': label1_day, 
                         'unlabel': unlabel_day})
In [103]:
ind = []
for i in range(26):
    ind.append( 'Sep ' + str(i+5) )
    
for i in range(31):
    ind.append( 'Oct ' + str(i+1) )

for i in range(5):
    ind.append( 'Nov ' + str(i+1) )
In [104]:
each_day.index = ind
In [105]:
cf.go_offline()
each_day[['label 1','unlabel']].iplot(kind='spread', title='The rate of label = 1 and unlabel in each day',yTitle = 'rate')

2. Modeling

In [7]:
X_raw = data.iloc[:,3:]
y_raw = data.iloc[:,1]
In [146]:
pca = PCA(n_components = 2)

2.1 Missing value

2.1.2 Impute by -1

In [41]:
X_minus = X_raw.copy()

X_minus[X_minus.isna()] = -1

X_label_1 = X_minus[y_raw != -1]

y_label = y_raw[y_raw != -1]
In [15]:
rnd_clf_9 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
rnd_clf_9.fit(X_label_1, y_label)
Out[15]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=20, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [17]:
df_9 = pd.DataFrame({'feature': X_label_1.columns, 
                   'importance': rnd_clf_9.feature_importances_ })
df_9 = df_9.sort_values(by='importance', ascending =False)

df_9_ = df_9[df_9['importance'] != 0 ]
print(str(len(df_9_)) + ' features importance != 0')

sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2)                                           # set the font size 
ax = sns.barplot(y = 'feature', x = 'importance',data=df_9[df_9['importance'] >= 0.01 ])        #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20) 
197 features importance != 0

2.1.2 Impute by median

In [25]:
X = X_raw.copy()
imputer = Imputer(strategy = 'median')
imputer.fit(X)
X_norm = imputer.transform(X)
X = pd.DataFrame(X_norm, columns = X.columns)

2.2 Random forest for feature importance

  • We use labeled data first.
In [26]:
X_label = X[y_raw != -1]
X_label.shape
Out[26]:
(990006, 297)
In [27]:
y_label = y_raw[y_raw != -1]
y_label.shape
Out[27]:
(990006,)
  • Using three different methods for unbalanced problem.

2.2.1 class_weight = 'balanced_subsample'

In [28]:
rnd_clf_1 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced_subsample')
rnd_clf_1.fit(X_label, y_label)
Out[28]:
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=20, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [36]:
df_1 = pd.DataFrame({'feature': X_label.columns, 
                   'importance': rnd_clf_1.feature_importances_ })
df_1 = df_1.sort_values(by='importance', ascending =False)

df_1_ = df_1[df_1['importance'] != 0 ]
print(str(len(df_1_)) + ' features importance != 0')

sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2)                                           # set the font size 
ax = sns.barplot(y = 'feature', x = 'importance',data=df_1[df_1['importance'] >= 0.01 ])        #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20) 
167 features importance != 0

2.2.3 class_weight = 'balanced'

In [37]:
rnd_clf_2 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
rnd_clf_2.fit(X_label, y_label)
Out[37]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=20, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [40]:
df_2 = pd.DataFrame({'feature': X_label.columns, 
                   'importance': rnd_clf_2.feature_importances_ })
df_2 = df_2.sort_values(by='importance', ascending =False)

df_2_ = df_2[df_2['importance'] != 0 ]
print(str(len(df_2_)) + ' features importance != 0')

sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2)                                           # set the font size 
ax = sns.barplot(y = 'feature', x = 'importance', data=df_2[df_2['importance'] >= 0.01] )        #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20) 
167 features importance != 0
In [41]:
t = pd.concat([df_1_, df_2_], axis = 1)
diff = []
for i in range(len(t)):
    if True in t.isnull().values[i]:
        diff.append(i)
t.iloc[diff,:]
Out[41]:
feature importance feature importance
55 f56 0.00041 NaN NaN
189 NaN NaN f190 0.000067
  • The difference between these two methods is whether f56 and f190 are included in the model.

2.2.3 fit( sample_weight )

In [42]:
num1 = sum(y_label == 1)
num0 = sum(y_label == 0)
           
print("label=1: " + str(num1) + "\nlabel=0: " + str(num0)+ 
      "\nratio: " + str( num0 / num1 ))
label=1: 12122
label=0: 977884
ratio: 80.67018643788154

2.2.3.1 weight: 80 : 1

In [43]:
weight_1 = np.ones(len(y_label))
weight_1[y_label == 0] = 1 / ( num1*80 + num0 )
weight_1[y_label == 1] = 80 / ( num1*80 + num0 )
sum(weight_1)
Out[43]:
1.0000000000195897
In [44]:
rnd_clf_3 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7)
rnd_clf_3.fit(X_label, y_label, sample_weight = weight_1)
Out[44]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)
In [45]:
df_3 = pd.DataFrame({'feature': X_label.columns, 
                   'importance': rnd_clf_3.feature_importances_ })
df_3 = df_3.sort_values(by='importance', ascending =False)

df_3_ = df_3[df_3['importance'] != 0 ]
print(str(len(df_3_)) + ' features importance != 0')

sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2)                                           # set the font size 
ax = sns.barplot(y = 'feature', x = 'importance', data=df_3[df_3['importance'] >= 0.01] )        #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20) 
168 features importance != 0
In [46]:
t2 = pd.concat([df_1_, df_2_, df_3_], axis = 1)
diff = []
for i in range(len(t)):
    if True in t2.isnull().values[i]:
        diff.append(i)
t2.iloc[diff,:]
Out[46]:
feature importance feature importance feature importance
55 f56 0.000410 NaN NaN f56 0.000410
56 NaN NaN NaN NaN f57 0.000077
88 NaN NaN NaN NaN f89 0.000037
102 f103 0.000059 f103 0.000058 NaN NaN
139 NaN NaN NaN NaN f140 0.000080
169 f170 0.000127 f170 0.000126 NaN NaN
189 NaN NaN f190 0.000067 NaN NaN

2.2.3.1 weight : 30 : 1

In [47]:
weight_2 = np.ones(len(y_label))
weight_2[y_label == 0] = 1 / ( num1*30 + num0 )
weight_2[y_label == 1] = 30 / ( num1*30 + num0 )
sum(weight_2)
Out[47]:
1.0000000000108022
In [48]:
rnd_clf_4 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7)
rnd_clf_4.fit(X_label, y_label, sample_weight = weight_2)
Out[48]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)
In [51]:
df_4 = pd.DataFrame({'feature': X_label.columns, 
                   'importance': rnd_clf_4.feature_importances_ })
df_4 = df_4.sort_values(by='importance', ascending =False)

df_4_ = df_4[df_4['importance'] != 0 ]
print(str(len(df_4_)) + ' features importance != 0')

sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2)                                           # set the font size 
ax = sns.barplot(y = 'feature', x = 'importance', data=df_4[df_4['importance'] >= 0.01] )        #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20) 
178 features importance != 0
In [52]:
t3 = pd.concat([df_1_, df_2_, df_3_, df_4_], axis = 1)
diff = []
for i in range(len(t)):
    if True in t3.isnull().values[i]:
        diff.append(i)
t3.iloc[diff,:].head()
Out[52]:
feature importance feature importance feature importance feature importance
3 NaN NaN NaN NaN NaN NaN f4 0.001056
7 NaN NaN NaN NaN NaN NaN f8 0.000071
18 NaN NaN NaN NaN NaN NaN f19 0.000206
55 f56 0.00041 NaN NaN f56 0.000410 NaN NaN
56 NaN NaN NaN NaN f57 0.000077 f57 0.000131
In [58]:
print("There are " + str(len(t3.iloc[diff,:])) + " differences.")
There are 34 differences.
  • Conclusion: balanced_subsample, balanced and sample_weight = 80:1 give essentially the same result. But without class balancing, the results differ a lot.
In [59]:
pd.concat([df_1, df_2, df_3, df_4], axis = 1).loc[35:46,:]
Out[59]:
feature importance feature importance feature importance feature importance
35 f36 0.0 f36 0.0 f36 0.0 f36 0.0
36 f37 0.0 f37 0.0 f37 0.0 f37 0.0
37 f38 0.0 f38 0.0 f38 0.0 f38 0.0
38 f39 0.0 f39 0.0 f39 0.0 f39 0.0
39 f40 0.0 f40 0.0 f40 0.0 f40 0.0
40 f41 0.0 f41 0.0 f41 0.0 f41 0.0
41 f42 0.0 f42 0.0 f42 0.0 f42 0.0
42 f43 0.0 f43 0.0 f43 0.0 f43 0.0
43 f44 0.0 f44 0.0 f44 0.0 f44 0.0
44 f45 0.0 f45 0.0 f45 0.0 f45 0.0
45 f46 0.0 f46 0.0 f46 0.0 f46 0.0
46 f47 0.0 f47 0.0 f47 0.0 f47 0.0
  • The importance of f36 - f47, which have lots of NA, are all 0. So we can drop them directly.

2.3 Prediction

2.3.1 Without using unlabel data

In [60]:
y_tr, y_te = train_test_split(y_label, test_size = 0.2, random_state =42)
print(len(y_tr), "train+", len(y_te), 'test')
792004 train+ 198002 test
In [61]:
X_tr = X.loc[y_tr.index,:]
X_te = X.loc[y_te.index,:]
In [62]:
rnd_clf_5 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [63]:
rnd_clf_5.fit(X_tr, y_tr)
Out[63]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=20, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [64]:
y_pred_1 = rnd_clf_5.predict(X_te)
In [65]:
confusion_matrix(y_te, y_pred_1)
Out[65]:
array([[178645,  16876],
       [   251,   2230]])
In [66]:
y_pred_prob_1 = rnd_clf_5.predict_proba(X_te)
In [67]:
roc_and_pr(y_te, y_pred_prob_1[:,1])
Out[67]:
Text(0.5,1,'PR curve: AP=0.449')

2.3.2 Set the unlabel value = 1

  • Since the unlabeled data is the high-risk group, we set its label to 1 first.
In [12]:
ind_unlabel = y_raw == -1
In [69]:
X_unlabel = X[ind_unlabel]
In [70]:
X_unlabel.shape
Out[70]:
(4725, 297)
In [71]:
X_2 = pd.concat([X_tr, X_unlabel])
In [106]:
y_unlabel_1 = pd.DataFrame(np.ones(sum(ind_unlabel)))
In [74]:
y_2 = pd.concat([y_tr, y_unlabel_1]).iloc[:,0]
In [75]:
rnd_clf_6 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [76]:
rnd_clf_6.fit(X_2, y_2)
Out[76]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=20, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [79]:
y_pred_2 = rnd_clf_6.predict(X_te)
In [80]:
confusion_matrix(y_te, y_pred_2)
Out[80]:
array([[170432,  25089],
       [   190,   2291]])
In [81]:
y_pred_prob_2 = rnd_clf_6.predict_proba(X_te)
In [196]:
roc_and_pr(y_te, y_pred_prob_2[:,1])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-196-431b8459c491> in <module>()
----> 1 roc_and_pr(y_te, y_pred_prob_2)

NameError: name 'y_te' is not defined

2.3.2 Using the train & test data split by date; only labeled data

In [219]:
X_tr_1020 = train1020_without_unlabel.iloc[:,3:]
In [220]:
y_tr_1020 = train1020_without_unlabel.iloc[:,1]
In [221]:
X_te_1020 = test1020_without_unlabel.iloc[:,3:]
In [222]:
y_te_1020 = test1020_without_unlabel.iloc[:,1]
In [223]:
X_tr_1020 = X_tr_1020.fillna(-1)
In [224]:
X_te_1020 = X_te_1020.fillna(-1)
In [115]:
rnd_clf_7 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [116]:
rnd_clf_7.fit(X_tr_1020, y_tr_1020)
Out[116]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [117]:
y_pred_3 = rnd_clf_7.predict(X_te_1020)
In [118]:
confusion_matrix(y_te_1020, y_pred_3)
Out[118]:
array([[256188,    359],
       [  2580,    833]])
In [119]:
y_pred_prob_3 = rnd_clf_7.predict_proba(X_te_1020)
In [121]:
roc_and_pr(y_te_1020, y_pred_prob_3[:,1])
Out[121]:
Text(0.5,1,'PR curve: AP=0.525')
In [122]:
score(fpr3, tpr3)
Out[122]:
0.4510694403750366

2.4 肖颂凯的猜想大实验

In [165]:
# data_without_unlabel = data[data['label'] != -1]
# date_ = date_df[date_df['label'] != -1 ]

thres1 = time.strptime(str(20170920), "%Y%m%d")
thres2 = time.strptime(str(20171005), "%Y%m%d")
thres3 = time.strptime(str(20171020), "%Y%m%d")

train11 = data_without_unlabel[date_['strptime'] <=thres1]


test11 = data_without_unlabel[ date_['strptime'] > thres3 ]
In [173]:
train22 = data_without_unlabel[(date_['strptime'] >thres1) & (date_['strptime'] <= thres2)]
In [174]:
train33 = data_without_unlabel[(date_['strptime'] >thres2) & (date_['strptime'] <= thres3)]

2.4.1

In [166]:
X_tr_11 = train11.iloc[:,3:]
y_tr_11 = train11.iloc[:,1]
X_te_11 = test11.iloc[:,3:]
y_te_11 = test11.iloc[:,1]

X_tr_11 = X_tr_11.fillna(-1)
X_te_11 = X_te_11.fillna(-1)
In [167]:
rnd_clf_8 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [168]:
rnd_clf_8.fit(X_tr_11, y_tr_11)
Out[168]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [169]:
y_pred_4 = rnd_clf_8.predict(X_te_11)
In [170]:
confusion_matrix(y_te_11, y_pred_4)
Out[170]:
array([[256432,    115],
       [  3119,    294]])
In [171]:
y_pred_prob_4= rnd_clf_8.predict_proba(X_te_11)
In [204]:
roc_and_pr(y_te_11, y_pred_prob_4[:,1])
In [185]:
score(fpr4, tpr4)
Out[185]:
0.35438031057720476

2.4.2

In [180]:
X_tr_22 = train22.iloc[:,3:]
y_tr_22 = train22.iloc[:,1]

X_tr_22 = X_tr_22.fillna(-1)

rnd_clf_10 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [181]:
rnd_clf_10.fit(X_tr_22, y_tr_22)
Out[181]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [182]:
y_pred_5 = rnd_clf_10.predict(X_te_11)
confusion_matrix(y_te_11, y_pred_5)
Out[182]:
array([[256203,    344],
       [  2837,    576]])
In [203]:
y_pred_prob_5= rnd_clf_10.predict_proba(X_te_11)

roc_and_pr(y_te_11, y_pred_prob_5[:,1])
In [186]:
score(fpr5, tpr5)
Out[186]:
0.3651333138001758

2.4.3

In [188]:
X_tr_33 = train33.iloc[:,3:]
y_tr_33 = train33.iloc[:,1]

X_tr_33 = X_tr_33.fillna(-1)

rnd_clf_11 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [189]:
rnd_clf_11.fit(X_tr_33, y_tr_33)
Out[189]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [190]:
y_pred_6 = rnd_clf_11.predict(X_te_11)
confusion_matrix(y_te_11, y_pred_6)
Out[190]:
array([[256302,    245],
       [  2746,    667]])
In [205]:
y_pred_prob_6= rnd_clf_11.predict_proba(X_te_11)

roc_and_pr(y_te_11, y_pred_prob_6[:,1])
In [193]:
score(fpr6,tpr6)
Out[193]:
0.46003515968356284

2.5 姜山的猜想大实验

Using the model and data from 2.3.2, i.e., the train and test sets split by date, with train : test = 3 : 1

In [225]:
# rnd_clf_7 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7, 
#                                    class_weight = 'balanced')

# rnd_clf_7.fit(X_tr_1020, y_tr_1020)

# y_pred_3 = rnd_clf_7.predict(X_te_1020)
In [226]:
confusion_matrix(y_te_1020, y_pred_3)
Out[226]:
array([[256188,    359],
       [  2580,    833]])
In [227]:
y_pred_prob_3 = rnd_clf_7.predict_proba(X_te_1020)
In [228]:
roc_and_pr(y_te_1020, y_pred_prob_3[:,1])
In [243]:
 
In [20]:
def jiangshan( y_score ):
    """One self-training round: pseudo-label confident test predictions.

    Test rows whose predicted fraud probability exceeds 0.9 are appended to
    the training set with label 1, a fresh random forest is refit, and the
    refit model is evaluated (confusion matrix + ROC/PR plots) on the test
    split.

    Parameters
    ----------
    y_score : 1-D array of fraud probabilities for the rows of X_te_1020.

    Returns
    -------
    ndarray of shape (n_test, 2): the refit model's predict_proba output
    on X_te_1020.

    Relies on the notebook globals X_tr_1020 / y_tr_1020 / X_te_1020 /
    y_te_1020 defined in section 2.3.2.
    """
    ind_add = y_score > 0.9
    X_add = pd.concat([X_tr_1020, X_te_1020[ind_add]])
    # Number of pseudo-labelled rows added this round.
    # (bug fix: was len(X_add_1), an undefined name -> NameError)
    print(len(X_add) - len(X_tr_1020))
    y_add = pd.concat([y_tr_1020, pd.DataFrame(np.ones(sum(ind_add)))]).iloc[:, 0]
    rnd_clf_js = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=7,
                                        class_weight='balanced')

    rnd_clf_js.fit(X_add, y_add)

    y_pred_js = rnd_clf_js.predict(X_te_1020)
    print(confusion_matrix(y_te_1020, y_pred_js))
    # bug fix: probabilities were taken from rnd_clf_12 (an undefined /
    # stale model from another cell) instead of the freshly fitted rnd_clf_js
    y_pred_prob_js = rnd_clf_js.predict_proba(X_te_1020)
    roc_and_pr(y_te_1020, y_pred_prob_js[:, 1])
    return y_pred_prob_js
In [247]:
jiangshan(y_pred_prob_7[:,1])
8
[[256177    370]
 [  2555    858]]
Out[247]:
array([[1.  , 0.  ],
       [1.  , 0.  ],
       [0.91, 0.09],
       ...,
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.96, 0.04]])

3 test

3.1

In [8]:
test = pd.read_csv('atec_anti_fraud_test_a.csv')
In [9]:
X_test = test.iloc[:,2:]
In [10]:
X_test_ = X_test.copy()
In [86]:
X_test_norm = imputer.transform(X_test_)
X_test_ = pd.DataFrame(X_test_norm, columns = X_test_.columns)
In [87]:
y_test_prob = rnd_clf_1.predict_proba(X_test_)

score = y_test_prob[:,1]

In [89]:
final = pd.DataFrame({'id':test.iloc[:,0], 'score': score})
In [90]:
final.head()
Out[90]:
id score
0 8e8290c270ec4bc3448dd5edd35c6f059b42d38f9ddd6f... 0.231247
1 f36c5fa5c0e7afccf733a4d74c7e06ffe43cc8fd24eda7... 0.357207
2 ee85f808b1fd49eaba308527e1686c509dc8e3e5057488... 0.112613
3 0df905aa187938d56a9b0816b13f54ac1f87d658a33cd0... 0.127775
4 15f532f979c4f092bbbe28e5409c8c3b8454ece8f1ab3e... 0.117424
In [91]:
final.to_csv('/Users/lijh/Downloads/ATEC/final.csv',index=False)

3.2

In [108]:
y_3 = y_raw.copy()
In [111]:
y_3[ind_unlabel] = 1
In [113]:
rnd_clf_7 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [115]:
rnd_clf_7.fit(X, y_3)
Out[115]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=20, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [116]:
y_test_prob_2 = rnd_clf_7.predict_proba(X_test_)
In [117]:
score_2 = y_test_prob_2[:,1]
In [118]:
final_2 = pd.DataFrame({'id':test.iloc[:,0], 'score': score_2})
In [19]:
final_2.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-fefbe46562c9> in <module>()
----> 1 final_2.head()

NameError: name 'final_2' is not defined
In [120]:
final_2.to_csv('/Users/lijh/Downloads/ATEC/final2.0.csv',index=False)

3.3

In [130]:
final_3 = pd.DataFrame({'id':test.iloc[:,0]})
In [131]:
final_3['score'] = 1
In [136]:
final_3.head()
Out[136]:
id score
0 8e8290c270ec4bc3448dd5edd35c6f059b42d38f9ddd6f... 1
1 f36c5fa5c0e7afccf733a4d74c7e06ffe43cc8fd24eda7... 1
2 ee85f808b1fd49eaba308527e1686c509dc8e3e5057488... 1
3 0df905aa187938d56a9b0816b13f54ac1f87d658a33cd0... 1
4 15f532f979c4f092bbbe28e5409c8c3b8454ece8f1ab3e... 1
In [132]:
final_3.to_csv('/Users/lijh/Downloads/ATEC/final3.0.csv',index=False)

3.4

In [133]:
final_4 = pd.DataFrame({'id':test.iloc[:,0]})
In [134]:
final_4['score'] = 0
In [137]:
final_4.head()
Out[137]:
id score
0 8e8290c270ec4bc3448dd5edd35c6f059b42d38f9ddd6f... 0
1 f36c5fa5c0e7afccf733a4d74c7e06ffe43cc8fd24eda7... 0
2 ee85f808b1fd49eaba308527e1686c509dc8e3e5057488... 0
3 0df905aa187938d56a9b0816b13f54ac1f87d658a33cd0... 0
4 15f532f979c4f092bbbe28e5409c8c3b8454ece8f1ab3e... 0
In [135]:
final_4.to_csv('/Users/lijh/Downloads/ATEC/final4.0.csv',index=False)

3.5

In [13]:
y_5 = y_raw.copy()
y_5[ind_unlabel] = 1
In [14]:
X_5 = X_raw.copy()
In [15]:
X_5 = X_5.fillna(-1)
In [16]:
X_test_5 = X_test.copy()
In [19]:
X_test_5 = X_test_5.fillna(-1)
In [28]:
rnd_clf_20 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7, 
                                 class_weight = 'balanced')
In [30]:
rnd_clf_20.fit(X_5, y_5)
Out[30]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [31]:
y_test_prob_5 = rnd_clf_20.predict_proba(X_test_5)
score_5 = y_test_prob_5[:,1]
In [40]:
def jiangshan2(y_score, threshold=0.7):
    """One round of self-training on the test set.

    Test rows whose current fraud probability exceeds ``threshold`` are
    pseudo-labelled as positive (1), appended to the training data, and a
    fresh random forest is fitted on the enlarged set.

    Parameters
    ----------
    y_score : array-like of shape (n_test,)
        Current predicted probability of class 1 for each test row.
    threshold : float, default 0.7
        Confidence cut-off above which a test row is pseudo-labelled.
        (The default reproduces the original hard-coded behaviour.)

    Returns
    -------
    ndarray of shape (n_test, 2)
        ``predict_proba`` output of the re-fitted forest on the test set.

    Notes
    -----
    Reads the notebook globals ``X_5``, ``y_5`` and ``X_test_5``.
    """
    ind_add = y_score > threshold
    X_add = pd.concat([X_5, X_test_5[ind_add]])
    # Report how many pseudo-labelled rows were added this round.
    print(len(X_add) - len(X_5))
    # BUG FIX: the original concatenated a single-column DataFrame of ones
    # onto the label Series; that only aligns if the Series name happens to
    # equal the DataFrame's column label, otherwise NaN labels are silently
    # introduced.  Concatenating a Series is alignment-safe.
    y_add = pd.concat([y_5, pd.Series(np.ones(int(ind_add.sum())))])
    rnd_clf_js = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                        random_state=7,
                                        class_weight='balanced')
    rnd_clf_js.fit(X_add, y_add)
    return rnd_clf_js.predict_proba(X_test_5)
In [37]:
score_5 = jiangshan2(score_5)[:,1]
2
In [38]:
score_5 = jiangshan2(score_5)[:,1]
4
In [39]:
score_5 = jiangshan2(score_5)[:,1]
5
In [41]:
score_5 = jiangshan2(score_5)[:,1]
64
In [42]:
score_5 = jiangshan2(score_5)[:,1]
108
In [44]:
# Run 19 further self-training rounds; the printed numbers below are the
# pseudo-labelled row counts added at each round (growth slows as it converges).
count = 1
while count < 20:
    score_5 = jiangshan2(score_5)[:,1]
    count = count + 1
145
177
199
216
238
263
278
299
310
317
327
333
341
343
355
359
362
370
375

#

In [138]:
# Parse the raw date strings of the test set and sort rows chronologically.
# NOTE(review): relies on a `strptime` helper defined elsewhere in the notebook.
test_strptime = test.date.apply(strptime)
In [139]:
test_df = test.copy()
In [140]:
test_df['strptime'] = test_strptime
In [141]:
test_df = test_df.sort_values(by='strptime')
############### 2018-06-04 ########## To be continued ####### By: Li Jiahui

#

In [121]:
# Keep only the labelled samples (-1 marks unlabelled rows).
y_label = y_raw[y_raw != -1]
In [373]:
 
792,004 train + 198,002 test samples
In [375]:

20,000 data points

In [389]:
# Work on the first 40k rows of the training split to speed up experiments.
X_tr_part = X_tr.iloc[0:40000,:]
y_tr_part = y_tr[0:40000]
In [377]:
# Restrict to the feature columns selected earlier (index list `ind`).
X_tr_part_imp = X_tr_part.iloc[:,ind]

Set all unlabelled samples' labels to 1

In [378]:
# Copy before overwriting so the original unlabelled vector is preserved.
y_unlabel_1 = y_unlabel.copy()
In [381]:
 
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/api.py:87: RuntimeWarning: '<' not supported between instances of 'str' and 'int', sort order is undefined for incomparable objects
  result = result.union(other)
In [383]:
 
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-383-bb7f4534ef59> in <module>()
----> 1 rnd_clf.fit(X, y1)

/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if sample_weight is not None:

/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    451                              % (array.ndim, estimator_name))
    452         if force_all_finite:
--> 453             _assert_all_finite(array)
    454 
    455     shape_repr = _shape_repr(array.shape)

/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
     42             and not np.isfinite(X).all()):
     43         raise ValueError("Input contains NaN, infinity"
---> 44                          " or a value too large for %r." % X.dtype)
     45 
     46 

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
In [234]:
 
Out[234]:
array([[170059,  25462],
       [   482,   1999]])

semi based on EM

First time

In [221]:
# EM-style self-training, round 1: predict labels for the unlabelled pool,
# then refit the forest on labelled + pseudo-labelled data.
y_unlabel_2 = rnd_clf.predict(X_unlabel)
In [222]:
y_unlabel_2 = pd.DataFrame(y_unlabel_2)
In [223]:
# NOTE(review): concatenating a Series with a one-column DataFrame only
# aligns if their column labels match -- verify y2 contains no NaNs.
y2 = pd.concat([y_tr_part, y_unlabel_2]).iloc[:,0]
In [224]:
rnd_clf.fit(X2, y2)
Out[224]:
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=16, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)
In [225]:
# Hold-out evaluation after EM round 1.
y_pred_rf = rnd_clf.predict(X_te)
In [226]:
confusion_matrix(y_te, y_pred_rf)
Out[226]:
array([[171036,  24485],
       [   524,   1957]])

Second time

In [65]:
# EM round 2: pseudo-label the unlabelled pool with the round-1 forest,
# append the labels, and fit a larger (200-tree) forest.
y_unlabel2 = rnd_clf_1.predict(X_unlabel)
y_unlabel2 = pd.DataFrame(y_unlabel2)
y_tr_2 = pd.concat([y_tr, y_unlabel2])
# Collapse the concatenated frame back to a 1-D label vector.
y_tr_2 = y_tr_2.iloc[:,0]
rnd_clf_2 = RandomForestClassifier(n_estimators=200, max_leaf_nodes=20, n_jobs = -1)
rnd_clf_2.fit(X_tr_add_unlabel, y_tr_2)
Out[65]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [70]:
# Hold-out evaluation after EM round 2.  Per Out[71] below, recall collapsed
# (23 of 2481 positives caught) -- the pseudo-labels appear to have drowned
# out the minority class.
y_pred_rf_2 = rnd_clf_2.predict(X_te)
In [71]:
confusion_matrix(y_te, y_pred_rf_2)
Out[71]:
array([[195517,      4],
       [  2458,     23]])

Third time

In [ ]:
# Sketch of EM round 3 (cell never executed; X_tr_3 / y_tr_3 are placeholders
# that would need to be built from the merged data first).
y_unlabel = rnd_clf.predict(X_unlabel)
# merge labelled and unlabelled data
X_tr_3
y_tr_3
rnd_clf_3 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=20, n_jobs = -1)

rnd_clf_3.fit(X_tr_3, y_tr_3)

y_pred_rf_3 = rnd_clf_3.predict(X_te)

confusion_matrix(y_te, y_pred_rf_3)

semi based on k-means

In [69]:
# Refit of the round-2 forest (duplicate of the call in In [65]).
rnd_clf_2.fit(X_tr_add_unlabel, y_tr_2)
Out[69]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [ ]:
# BUG FIX: only `from sklearn.cluster import KMeans` is imported at the top of
# the notebook -- `sklearn` itself is not a bound name, so the qualified call
# `sklearn.cluster.KMeans(...)` raises NameError.  Use the imported class and
# keep the estimator so it can actually be fitted later.
kmeans_semi = KMeans(n_clusters=8, random_state=7, n_jobs=-1)

semi based on graph regularization

In [98]:
a = [1, None, 2]
# BUG FIX: the built-in sum() takes no keyword arguments -- `skipna` is a
# pandas argument, so the original call raised TypeError.  Use a NaN-aware
# pandas sum instead (equivalently: sum(x for x in a if x is not None)).
a_sum = pd.Series(a).sum(skipna=True)  # -> 3.0